1 Single-Cell Data Surgery: Cynthia Sandor’s Data

1.1 Aims for this dataset

QC the data appropriately and then find genes differentially expressed between cells with different genotypes.

1.2 Tasks for this dataset

  1. Load data then QC cells and genes
  2. Normalise data
  3. Investigate and correct batch effects
  4. Visualise data [throughout]
  5. Explore methods for differential expression (DE) analysis

2 QC

2.1 Task: QC cells and genes

  • Load the data into an SCESet object
  • QC cells using appropriate functions in scater
  • Filter out genes with very low average expression

2.2 Load data into SCESet

load("cs_scater_workshop.RData")
pd <- new("AnnotatedDataFrame", scater_wt_cell)
sce <- newSCESet(countData = scater_wt_counts, phenoData = pd)
sce
## SCESet (storageMode: lockedEnvironment)
## assayData: 57865 features, 384 samples 
##   element names: counts, exprs 
## protocolData: none
## phenoData
##   sampleNames: S35P1P160125UOXZP21C05 S78P3P160125UOXCP21F10 ...
##     S67P1P160125UOXZP21C09 (384 total)
##   varLabels: Cell Genotype ... cDNA_ng_per_ul (7 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:

Expression values are log2(cpm + 1).

2.3 Get gene annotations

sce <- getBMFeatureAnnos(
    sce, filters = "ensembl_gene_id",
    attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", 
                   "start_position", "end_position", "strand", "gene_biotype"),
    feature_symbol = "hgnc_symbol",
    feature_id = "ensembl_gene_id", biomart = "ENSEMBL_MART_ENSEMBL",
    dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
head(fData(sce), n = 1)
##                 ensembl_gene_id hgnc_symbol chromosome_name start_position
## ENSG00000223972 ENSG00000223972     DDX11L1               1          11869
##                 end_position strand                       gene_biotype
## ENSG00000223972        14409      1 transcribed_unprocessed_pseudogene
##                 feature_symbol      feature_id
## ENSG00000223972        DDX11L1 ENSG00000223972

2.4 View cell metadata

head(pData(sce))
##                                          Cell Genotype            Lab
## S35P1P160125UOXZP21C05 S35P1P160125UOXZP21C05    PSEN1            Zam
## S78P3P160125UOXCP21F10 S78P3P160125UOXCP21F10    PSEN1 Colin_Ackerman
## S44P1P160125UOXZP21D06 S44P1P160125UOXZP21D06    PSEN1            Zam
## S48P2P160125UOXCC21H06 S48P2P160125UOXCC21H06  control Colin_Ackerman
## S71P4P160125UOXZC21G09 S71P4P160125UOXZC21G09  control            Zam
## S24P3P160125UOXCP21H03 S24P3P160125UOXCP21H03    PSEN1 Colin_Ackerman
##                        cell_bulk Induction FACS_Replicate cDNA_ng_per_ul
## S35P1P160125UOXZP21C05      cell         2              1           0.24
## S78P3P160125UOXCP21F10      cell         2              1           0.29
## S44P1P160125UOXZP21D06      cell         2              1           0.36
## S48P2P160125UOXCC21H06      cell         2              1           0.36
## S71P4P160125UOXZC21G09      cell         2              1           0.34
## S24P3P160125UOXCP21H03      cell         2              1           0.58
plot(sce, block1 = "Lab", block2 = "Genotype", colour_by = "cell_bulk",
     exprs_values = "counts")

2.5 Calculate QC metrics

#Two sets of feature controls: ERCC spike ins & mitochondrial genes
fctrls <- featureNames(sce)[grepl("ERCC-", featureNames(sce))]
fctrls2 <- featureNames(sce)[grepl("MT", fData(sce)$chromosome_name)]
#Cell controls are bulks and empty wells
cctrls <- cellNames(sce)[pData(sce)$cell_bulk == "bulk"]
#Set threshold of 80% of reads from feature controls to flag cell
sce <- calculateQCMetrics(
    sce,  
    feature_controls = list(ERCC_ctrl = fctrls, mt_ctrl = fctrls2),
    cell_controls = list(bulks = cctrls),
    nmads = 8, pct_feature_controls_threshold = 80)
plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = total_features,
                       colour = cell_bulk, size = log10_total_counts)) +
    geom_hline(yintercept = 2500, linetype = 2) + 
    geom_vline(xintercept = 50, linetype = 2)

plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = cDNA_ng_per_ul,
                       colour = cell_bulk, size = log10_total_counts)) +
    geom_vline(xintercept = 50, linetype = 2) + coord_cartesian(ylim = c(0, 2.2))

plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = pct_counts_feature_controls,
                       colour = cell_bulk, size = log10_total_counts)) +
    stat_smooth(colour = "firebrick2", linetype = 2) + geom_hline(yintercept = 13, linetype = 2)

sce <- plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "cell_bulk", 
                   pca_data_input = "pdata", detect_outliers = TRUE,
                   return_SCESet = TRUE, selected_variables = c("cDNA_ng_per_ul", "pct_counts_top_200_features", "total_features", "pct_counts_feature_controls", "n_detected_feature_controls", "log10_counts_endogenous_features", "log10_counts_feature_controls"))

plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "cell_bulk",
        colour_by = "outlier") + ggtitle("PCA using expression values")

plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "Lab",
        colour_by = "Genotype") + ggtitle("PCA using expression values")

plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "Genotype",
        colour_by = "Lab") + ggtitle("PCA using expression values")

2.6 Filter cells

sce$use <- (sce$total_features > 2000 & # sufficient genes detected
                sce$pct_counts_top_200_features < 50 & # sufficient library complexity
                sce$pct_counts_feature_controls < 14 & # sufficient endogenous RNA
                sce$total_counts > 1e5 & # sufficient reads mapped to features
                !sce$filter_on_total_features & # remove cells with unusual numbers of genes
                !sce$is_cell_control # controls shouldn't be used in downstream analysis
)
knitr::kable(as.data.frame(table(sce$use)))
Var1 Freq
FALSE 39
TRUE 345
plotPCA(sce[, sce$use], size_by = "pct_counts_top_200_features", shape_by = "Genotype",
        colour_by = "Lab") + ggtitle("PCA after filtering cells")

plotQC(sce[, sce$use], type = "find-pcs", variable = "Lab" )

plotQC(sce[, sce$use], type = "find-pcs", variable = "Genotype" )

2.7 PlotQC: most highly expressed genes

plotQC(sce, type = "highest-expression", exprs_values = "counts", n = 20) +
    theme(axis.text.y = element_text(size = 10),
              axis.title = element_text(size = 11))

2.8 PlotQC: expression frequency against mean

plotQC(sce, type = "exprs-freq-vs-mean", feature_controls = fctrls)

2.9 Filter genes

keep_gene <- rowMeans(counts(sce)) >= 1
fData(sce)$use <- keep_gene
knitr::kable(as.data.frame(table(keep_gene)))
keep_gene Freq
FALSE 45058
TRUE 12807

3 Normalise expression data

3.1 Task: Scaling normalisation

  • Use the methods from the scran package to apply scaling normalisation to the data.
  • Look at the functions isSpike, computeSumFactors, computeSpikeFactors and normalise

3.2 Scaling normalisation

library(scran)
sce <- sce[fData(sce)$use, sce$use]
isSpike(sce) <- "ERCC_ctrl"
sce <- computeSumFactors(sce, sizes = c(20, 40, 60, 80))
summary(sizeFactors(sce))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3077  0.6856  0.9835  1.0000  1.2540  2.8190
sce <- computeSpikeFactors(sce, type = "ERCC_ctrl", general.use = FALSE)
sce <- normalise(sce, return_norm_as_exprs = FALSE)
p1 <- plotPCA(sce, size_by = "total_features", shape_by = "Genotype",
        colour_by = "Lab") + ggtitle("PCA before normalisation")
p2 <- plotPCA(sce, size_by = "total_features", shape_by = "Genotype",
        colour_by = "Lab", exprs_values = "norm_exprs") + ggtitle("PCA after normalisation")
multiplot(p1, p2, cols = 2)

3.3 Important explanatory variables

plotQC(sce, type = "expl", 
        variables = c("pct_counts_top_100_features", "total_features", 
                      "pct_counts_feature_controls", "Lab",
                      "n_detected_feature_controls", "Genotype",
                      "log10_counts_endogenous_features",
                      "log10_counts_feature_controls"))

4 Batch effect

4.1 Batch effect

The PCA plots above show that scaling normalisation does not remove batch effects (this is beyond its scope).

Options for dealing with batch effects (in order of preference):

  1. Design the experiment to avoid them
  2. Model them explicitly
  3. Adjust/correct data to remove or ameliorate them
  • Here we will address (3)

4.2 Correcting for batch effects

Options:

  • Using a method to identify and remove hidden/latent factors from the data. I have had good results with the RUVs method from the RUVSeq package.
  • Regressing out known factors [shown here].

4.3 Task: Regress out Lab effect

  • Use scater’s normaliseExprs() function to regress out the Lab effect.
  • Compare results of this adjustment to previous normalisation results.

4.4 Regressing out known factors

design <- model.matrix(~sce$Lab)
set_exprs(sce, "norm_exprs_resid") <- norm_exprs(
    normaliseExprs(sce, design = design,
                   method = "none", exprs_values = "exprs",
                   return_norm_as_exprs = FALSE))
p3 <- plotPCA(sce, exprs_values = "norm_exprs_resid", shape_by = "Genotype", 
    colour_by = "Lab", size_by = "total_features") + 
    ggtitle("PCA - size-factor normalisation residuals")
multiplot(p2, p3, cols = 2)

plotTSNE(sce, exprs_values = "norm_exprs_resid", shape_by = "Lab", 
    colour_by = "Genotype", size_by = "total_features", rand_seed = 5) + 
    ggtitle("t-SNE - size-factor normalisation residuals")

5 Differential Expression Analysis

5.1 Do we need to filter genes/libraries beore DE analysis?

  • Yes
  • Filter libraries we believe to be problematic, as done in our QC previously
  • We can increase power to detect DE genes (reducing multiple-testing burden) by filtering genes, BUT
  • We have to be careful with gene filtering: must not filter on gene attributes associated with DE testing itself, or we will bias results and p-values/FDR will no longer be accurate
  • Generally safe and useful approach is to filter out genes with low expression in a way that is blind to which group/experimental condition/etc they belong to

5.2 Tools for DE analysis

  • edgeR [Bioc]: use trend.method="none" argument in estimateDisp call
  • scde [Bioc]
  • M3Drop [Bioc]
  • MAST [devtools::install_github("RGLab/MAST@summarizedExpt"")]
  • BPSC [devtools::install_github(“nghiavtr/BPSC”)]
  • scDD [devtools::install_github(“kdkorthauer/scDD”)]

5.3 What’s the best method for DE?

It depends…

5.4 Our methodological desires…

We To find genes DE between PSEN1 and control cells accounting for Lab effects (at least). So we want the following in a DE method/tool:

  1. can handle general experimental designs
  2. accounts for characteristics of single-cell expression data (e.g. bimodal distributions)
  3. is fast

(a good tool should also be reliable, easily available, and well supported)

Choose two.

5.5 Pros/cons

  • edgeR: general designs, fast, reliable | negative binomial distribution (unimodal)
  • MAST: general designs, single-cell model | fast, reliable(?)
  • BPSC: general designs, single-cell model | fast(?), reliable(?),
  • scde: single-cell model | fast, reliable(?), general designs
  • scDD: single-cell model (non-parametric) | fast(?), reliable(?), general designs(?)

5.6 Interpretation of results

  • Specific threshold on p-value and/or fold-change
  • Comparison with the results of bulk experiments

5.7 Tasks: DE analysis with edgeR

  • Fit a model in edgeR with both Lab and Genotype effects
  • Find genes that are DE between different genotypes

Tips:

  • Use the convertTo function in scran to convert an SCESet to a DGEList for edgeR analysis. (This retains necessary metadata, including normalisation factors previously computed.)

5.8 edgeR estimate dispersion

library(edgeR)
dge <- convertTo(sce, "edgeR")
design <- model.matrix(~Lab + Genotype, data = pData(sce))
head(design)
##                        (Intercept) LabZam GenotypePSEN1
## S35P1P160125UOXZP21C05           1      1             1
## S78P3P160125UOXCP21F10           1      0             1
## S44P1P160125UOXZP21D06           1      1             1
## S48P2P160125UOXCC21H06           1      0             0
## S71P4P160125UOXZC21G09           1      1             0
## S24P3P160125UOXCP21H03           1      0             1
dge <- estimateDisp(dge, design, trend.method = "none")

5.9 edgeR plot biological CV

plotBCV(dge)

5.10 edgeR fit quasi-likelihood GLM

fit <- glmFit(dge, design, prior.count = 0.5)
results <- glmLRT(fit)
summary(decideTestsDGE(results))
##    [,1] 
## -1   123
## 0  12439
## 1    202
topTags(results)
## Coefficient:  GenotypePSEN1 
##                      logFC    logCPM        LR       PValue          FDR
## ENSG00000187653  2.0466186  5.485889 198.45372 4.542261e-45 5.797742e-41
## ENSG00000215030  2.2978459  5.617554 195.14951 2.389905e-44 1.525238e-40
## ENSG00000213058  2.3825384  3.833270 180.92401 3.045605e-41 1.295804e-37
## ENSG00000224114  2.0730867  3.652758 156.94785 5.254660e-36 1.676762e-32
## ENSG00000215559 -3.5982078  4.478981 127.41208 1.509414e-29 3.853232e-26
## ENSG00000236876  1.7600141  3.548399 120.69503 4.456324e-28 9.480087e-25
## ENSG00000205542  1.2916156 11.639844 117.30226 2.464891e-27 4.494553e-24
## ENSG00000227097  2.0212006  3.977080 112.21216 3.210497e-26 5.122349e-23
## ENSG00000075624  0.9057414 13.272414  61.73445 3.930429e-15 5.574221e-12
## ENSG00000184009  0.8840725 12.939755  59.31414 1.344073e-14 1.715575e-11

5.11 edgeR MA plot

n_de <- sum(abs(decideTestsDGE(results, p.value = 0.001)))
de_tags <- rownames(topTags(results, n = n_de)$table)
plotSmear(results, de.tags = de_tags, smooth.scatter = TRUE, 
          lowess = TRUE, cex = 0.4)

5.12 Expression for top edgeR DE genes

plotExpression(sce, de_tags[1:6], x = "Genotype", colour_by = "Lab", ncol = 3)

5.13 M3Drop - Michaelis-Menten modeling of dropouts

library("M3Drop")
norm_counts <- t(t(counts(sce)) / sizeFactors(sce))
m3_fit <- M3DropDropoutModels(norm_counts)

5.14 M3Drop - DE genes

m3_de <- M3DropDifferentialExpression(norm_counts, 
            mt_method = "fdr", mt_threshold = 0.01)

5.15 M3Drop - heatmap (Genotype)

heat_out <- M3DropExpressionHeatmap(m3_de$Gene, norm_counts, 
            cell_labels = sce$Genotype)

5.16 M3Drop - heatmap (Lab)

heat_out <- M3DropExpressionHeatmap(m3_de$Gene, norm_counts, 
            cell_labels = sce$Lab)

Beware batch effects!

5.17 scDD

In principle, the below should work…but it did not for me.

As it is not on CRAN or Bioconductor, there is no established forum for asking questions and obtaining advice, and (it would appear here) that it is not as reliable as it should be.

eset <- ExpressionSet(assayData(sce))
exprs(eset) <- get_exprs(sce, "exprs")
eset$condition <- as.numeric(as.factor(sce$Genotype))
sdd_res <- scDD(eset, testZeroes = FALSE)

I’ll leave it with you to explore further options for DE testing. Many good tools are available, but it’s still the wild west out there.